http://kdd.ics.uci.edu/databases/kddcup99/task.html
Here is a paper that analyzes the dataset https://web.cs.dal.ca/~zincir/bildiri/pst05-gnm.pdf
In [5]:
    
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import model_selection, linear_model, cluster, \
    preprocessing, metrics, pipeline, tree, ensemble, decomposition
pd.options.display.max_columns = 1000
%matplotlib inline
    
In [6]:
    
# Number of clusters for MiniBatchKMeans below; chosen ad hoc — TODO: justify via elbow/silhouette.
num_cluster = 30
    
In [7]:
    
# KDD Cup 99 feature schema, one "name: type." entry per line; keep just the
# name before the colon, then append the label column ("Category").
schema = """
duration: continuous.
protocol_type: symbolic.
service: symbolic.
flag: symbolic.
src_bytes: continuous.
dst_bytes: continuous.
land: symbolic.
wrong_fragment: continuous.
urgent: continuous.
hot: continuous.
num_failed_logins: continuous.
logged_in: symbolic.
num_compromised: continuous.
root_shell: continuous.
su_attempted: continuous.
num_root: continuous.
num_file_creations: continuous.
num_shells: continuous.
num_access_files: continuous.
num_outbound_cmds: continuous.
is_host_login: symbolic.
is_guest_login: symbolic.
count: continuous.
srv_count: continuous.
serror_rate: continuous.
srv_serror_rate: continuous.
rerror_rate: continuous.
srv_rerror_rate: continuous.
same_srv_rate: continuous.
diff_srv_rate: continuous.
srv_diff_host_rate: continuous.
dst_host_count: continuous.
dst_host_srv_count: continuous.
dst_host_same_srv_rate: continuous.
dst_host_diff_srv_rate: continuous.
dst_host_same_src_port_rate: continuous.
dst_host_srv_diff_host_rate: continuous.
dst_host_serror_rate: continuous.
dst_host_srv_serror_rate: continuous.
dst_host_rerror_rate: continuous.
dst_host_srv_rerror_rate: continuous.
"""
columns = []
for entry in schema.split("\n"):
    if entry:  # skip the empty first/last lines produced by the triple-quoted string
        columns.append(entry.partition(":")[0])
columns.append("Category")
print(columns)
    
    
In [8]:
    
# Load the raw KDD Cup 99 data; the file has no header row, so supply column names.
# NOTE(review): hardcoded absolute path — consider a configurable DATA_DIR.
df = pd.read_csv("/data/kddcup.data", header=None, names=columns)
    
In [9]:
    
# Sanity-check the column mapping against the first rows.
df.head()
    
    Out[9]:
In [10]:
    
# Distribution of the raw Category labels (attack names plus "normal.").
df.Category.value_counts()
    
    Out[10]:
Attacks fall into one of four categories: User to Root; Remote to Local; Denial of Service; and Probe.
Mapping is below.
In [11]:
    
# Map each raw KDD label to one of the four attack families (dos, probe, r2l,
# u2r) or "normal". Keys keep the trailing dot exactly as they appear in the
# Category column; the mapping is declared family-first for readability.
_families = {
    "normal": ["normal."],
    "dos": ["neptune.", "smurf.", "pod.", "teardrop.", "land.", "back."],
    "probe": ["portsweep.", "ipsweep.", "satan.", "nmap."],
    "r2l": ["guess_passwd.", "ftp_write.", "imap.", "phf.", "multihop.",
            "warezmaster.", "warezclient.", "spy."],
    "u2r": ["buffer_overflow.", "loadmodule.", "perl.", "rootkit."],
}
attack_types = {name: family
                for family, names in _families.items()
                for name in names}
    
In [12]:
    
# Binary label for anomaly detection, plus the coarse 4-way attack family.
df["label"] = np.where(df.Category == "normal.", "normal", "attack")
# .map is the idiomatic (and faster) replacement for .apply(lambda r: dict[r]);
# unlike the lambda it yields NaN instead of raising KeyError for unmapped labels.
df["attack_type"] = df.Category.map(attack_types)
    
In [13]:
    
# Class balance: fraction of normal vs attack rows.
df.label.value_counts()/df.shape[0]
    
    Out[13]:
In [14]:
    
# Family distribution; dropna=False would surface any label missed by the mapping.
df.attack_type.value_counts(dropna=False)
    
    Out[14]:
In [15]:
    
# Keep only the numeric columns for scaling/PCA; the symbolic features
# (protocol_type, service, flag, ...) are dropped rather than encoded.
df_num = df.select_dtypes(include=[np.float64, np.int64])
df_num.head()
    
    Out[15]:
In [16]:
    
# Standardize to zero mean / unit variance so PCA is not dominated by large-scale features.
X = preprocessing.StandardScaler().fit_transform(df_num)
    
In [235]:
    
%%time
def display_2d(X, n_samples = 10000):
    pca = decomposition.PCA(n_components=2)
    pca_values = pca.fit_transform(X)
    X_pca = pca_values.copy()
    X_pca = pd.DataFrame(X_pca)
    X_pca["color"] = np.where(labels == "attack", "red", "green")
    X_sample = X_pca.sample(n_samples)
    colors = X_sample.color
    X_sample.plot.scatter(0, 1, color = colors)
    return pca_values
X_pca = display_2d(X)
    
    
    
In [237]:
    
%%time
# Supervised baseline: can a shallow decision tree separate attack vs normal
# using only the 2-D PCA projection returned by display_2d?
y = preprocessing.LabelEncoder().fit_transform(df.label)
# NOTE(review): X_pca here is the 2-component projection; a later cell rebinds
# it to a 25-component one, so running cells out of order changes this result.
X_train, X_test, y_train, y_test = model_selection.train_test_split(X_pca, y, test_size = 0.3, random_state = 1)
est = tree.DecisionTreeClassifier(max_depth=5)
est.fit(X_train, y_train)
print("Accuracy:", est.score(X_test, y_test))
    
    
In [242]:
    
# Relative importance of the two PCA components in the fitted tree.
est.feature_importances_
    
    Out[242]:
In [39]:
    
# Fit a full PCA (all components) to inspect the explained-variance spectrum.
pca = decomposition.PCA()
pca.fit(X)
    
    Out[39]:
In [43]:
    
# Scree plot: per-component explained variance (bars) plus the cumulative sum
# (line). Title/labels added so the figure stands alone; trailing ';'
# suppresses the axes repr.
fig, ax = plt.subplots(figsize=(10, 6))
pd.Series(pca.explained_variance_ratio_).plot.bar(ax=ax)
pd.Series(np.cumsum(pca.explained_variance_ratio_)).plot.line(ax=ax)
ax.set(title="PCA explained variance", xlabel="component",
       ylabel="explained variance ratio");
    
    Out[43]:
    
In [171]:
    
# First component counts at which cumulative explained variance reaches 99%.
pd.DataFrame({"cumsum": np.cumsum(pca.explained_variance_ratio_)}).query("cumsum>=0.99").head()
    
    Out[171]:
In [18]:
    
%%time
# Reduce to 25 components — enough for >=99% of the variance per the previous cell.
pca = decomposition.PCA(n_components=25)
# NOTE(review): rebinds X_pca (previously the 2-D projection from display_2d).
X_pca = pca.fit_transform(X)
    
    
In [19]:
    
%%time
kmeans = cluster.MiniBatchKMeans(n_clusters=num_cluster)
y_cluster = kmeans.fit_predict(X_pca)
    
    
In [20]:
    
# Cluster sizes, largest first.
pd.Series(y_cluster).value_counts()
    
    Out[20]:
In [224]:
    
# Per-cluster breakdown of raw Category labels — a quick purity check of the
# unsupervised clusters against the known attack names.
for cluster_id in range(num_cluster):
    print("Cluster: ", cluster_id, "")
    print(df.loc[y_cluster == cluster_id, "Category"].value_counts())
    print("\n")
    
    
In [178]:
    
# Euclidean distance of each point to the centroid of its assigned cluster.
distances = np.zeros([df.shape[0]])
for i in range(num_cluster):
    centroid = kmeans.cluster_centers_[i]
    distances[y_cluster==i] = np.sqrt(np.sum((X_pca[y_cluster==i] - centroid)**2, axis = 1))
# Largest 100 distances — the most extreme points are outlier candidates.
# NOTE(review): could be vectorized in one step via kmeans.cluster_centers_[y_cluster],
# but the scratch cells near the end reuse the loop's leftover `i`/`centroid`.
np.sort(distances)[::-1][:100]
    
    Out[178]:
In [180]:
    
# Sanity check: sum of squared point-to-centroid distances vs kmeans.inertia_
# (presumably only approximately equal for MiniBatchKMeans — confirm).
np.sum(distances ** 2), kmeans.inertia_
    
    Out[180]:
In [ ]:
    
Average distance from each point to its assigned centroid, computed per cluster.
    
In [201]:
    
# Mean point-to-centroid distance per cluster, plotted largest first; clusters
# with a high average distance are loose and may harbor anomalies.
cluster_avg_distances = [
    np.mean(distances[y_cluster == cluster_id])
    for cluster_id in range(num_cluster)
]
pd.Series(cluster_avg_distances).sort_values(ascending=False).plot.bar()
    
    Out[201]:
    
In [203]:
    
# Maximum point-to-centroid distance per cluster, plotted largest first — the
# single most extreme member of each cluster.
cluster_max_distances = [
    np.max(distances[y_cluster == cluster_id])
    for cluster_id in range(num_cluster)
]
pd.Series(cluster_max_distances).sort_values(ascending=False).plot.bar()
    
    Out[203]:
    
In [143]:
    
# Label mix among the most distant points.
# NOTE(review): 113 is a magic threshold, presumably read off the sorted-distance
# output above — prefer deriving it from a computed quantile.
pd.Series(df.label[distances>113]).value_counts()
    
    Out[143]:
In [145]:
    
# Box plot of the distances; the whisker-based outlier rule below mirrors this view.
plt.boxplot(distances);
    
    
In [162]:
    
def outliers(distances):
    """Flag points outside the Tukey box-plot whiskers of `distances`.

    Parameters
    ----------
    distances : array-like of float
        Point-to-centroid distances.

    Returns a boolean array, True where a value lies more than 1.5 * IQR
    beyond the first or third quartile.
    """
    # Fix: np.percentile expects percentages in [0, 100]; the original passed
    # the fractions [0.25, 0.75], which computed quantiles near the minimum
    # and flagged almost every point as an outlier.
    q1, q3 = np.percentile(distances, [25, 75])
    iqr = q3 - q1
    upper_whisker = q3 + 1.5 * iqr
    lower_whisker = q1 - 1.5 * iqr
    return (distances > upper_whisker) | (distances < lower_whisker)
    
In [166]:
    
# How many points the whisker rule flags as outliers.
pd.Series(outliers(distances)).value_counts()
    
    Out[166]:
In [122]:
    
# NOTE(review): scratch check relying on `i` and `centroid` left over from the
# distance loop (i == num_cluster - 1 after it); breaks on a fresh kernel.
np.sqrt(np.sum((X_pca[y_cluster==i] - centroid)**2, axis = 1)).shape
    
    Out[122]:
In [175]:
    
# NOTE(review): scratch check relying on the leftover loop variable `i`;
# breaks on a fresh kernel. Consider deleting both scratch cells.
X_pca[y_cluster==i].shape
    
    Out[175]:
In [ ]: